
*************************************************************************************************************************************;
*  	Code Name: Main_code_RFS.sas
  	Authors: Per Sid�n and Thomas Jansson (Sveriges Riksbank)
	Created: March 25, 2013
	Updated: June 20, 2023

* 	Extract the individuals and variables and form a balanced panel that will be used in the regression analysis. 
   	In every step we document how many observations that are removed from the data.
   	We are using both the regular LINDA sample ("f") and the LINDA immigrant sample ("i") from Statistics Sweden.
*************************************************************************************************************************************;

* Clear everything;
DM 'CLEAR LOG';
DM 'CLEAR OUTPUT';
Proc datasets LIB = work KILL;

* Select libraries;
libname original 'DEFINE PATH HERE' access = readonly; 
libname kuru 'DEFINE PATH HERE' access = readonly;	
	
libname main 'DEFINE PATH HERE';
libname bal 'DEFINE PATH HERE';
libname out 'DEFINE PATH HERE';
libname temp 'DEFINE PATH HERE';
libname tempinc 'DEFINE PATH HERE';
libname grund 'DEFINE PATH HERE';
libname keys 'DEFINE PATH HERE';
libname save 'DEFINE PATH HERE' ;

* Select paths;
filename pathnew "DEFINE PATH HERE";
filename pathgru "DEFINE PATH HERE";

* Select old libraries (used to compare with previous versions);
libname mainold 'DEFINE PATH HERE' access = readonly;
libname saveold 'DEFINE PATH HERE' access = readonly;
libname mainprev 'DEFINE PATH HERE' access = readonly;
libname tempprev 'DEFINE PATH HERE' access = readonly;
libname balprev 'DEFINE PATH HERE' access = readonly;
libname outprev 'DEFINE PATH HERE' access = readonly;
libname grundpre 'DEFINE PATH HERE' access = readonly;
*************************************************************************************************************************************;

* Define dates;
%let date = 230423R;
%let lastdate = 200429R;

* Define which variables to include in the analysis dataset;
%let entrvars = 		sni_2 sni_entr_2 sni_1 sni_entr_1 sni sni_entr sni_entr_hh sni_entr_alt_hh sni_2_1993 sni_2_1995 ssyk
						enterpr_inc_hh enterpr_inc_hh1999-enterpr_inc_hh2007
						unemploybenefits1993-unemploybenefits2007 welfare welfare_fam interest_fam;

%let robotvars = 		muni muni1993 muni1999
						inchh_earned inchh_broad inchh_assessed inchh_taxable inchh_dis_nocap inc_earned inc_broad inc_assessed inc_taxable 
						inchh_earned1995-inchh_earned1998 inchh_disp1994-inchh_disp1998
						edumajor_1_99 edumajor_2_99 edumajor_3_99 years_edu1993
						return_lag1;

%let balsvarsorg = 		id idhh year birthcountry citizencountry
			    		birthcountry_code citizencountry_code age immigr_year hh_size
			   	 		sex retired unemployed employed student married inc inchh lan finsector 
						reg_immi_rat sp_birthcountry_code sp_citizencountry_code sampled_f sampled_i 
						edulev eduorient hh_children familyrelation govsector
						hhstocks hhmutualother hhstocksmutual hhdebt hhhouse hhbank hhfinw
						hhcoop hhrealassets hhtotalassets hhnetw hhdebtstudies hhrisky
						lan_group1-lan_group11;

%let allvars =  		&balsvarsorg &robotvars &entrvars parish partnerdummy over18 singleparentwithdepchild hh_adults;

%let yugo = ('BA','HR','MK','YU','CS','SI','RS','ME'); 
%let czech = ('CZ','SK','CE');						
%let nocountry = ('XU','OB','XO','XS','XU','ZZ');
*************************************************************************************************************************************;

***************************************;
** STEP 1: BACKGROUND VARIABLE CODES **;
***************************************;

* Run codes;
%INCLUDE pathgru(CreateCountryKey);
%INCLUDE pathgru(CreateEduKey);
%INCLUDE pathgru(Part1InitializeLinda);
%INCLUDE pathgru(Part2IncludeAssets);
%INCLUDE pathgru(Part3CreateHHVariables);
%INCLUDE pathgru(Part4RegionalImmigrantRatio);

* Change status of some folders to read only;
libname main 'DEFINE PATH HERE' access = readonly;
libname keys 'DEFINE PATH HERE' access = readonly;

* Save in temp folder and keep only relevant variables (listed above);
%macro m0ab(f_or_i);
%do I = 1999 %to 2007;

data temp.linda&f_or_i&I;
	set main.linda&f_or_i&I;
		keep &balsvarsorg region;
run;

proc sort data = temp.linda&f_or_i&I;
	by id;
run;

%end;

%mend;
%m0ab(f); *regular LINDA sample;
%m0ab(i); *immigrant LINDA sample;

* Run additional codes to generate necessary variables for the empirical analysis;
%INCLUDE pathnew(Part1_Lindadata);
	proc datasets LIB = work KILL; run;
%INCLUDE pathnew(Part2_Edu_industry);
	proc datasets LIB = work KILL; run;
%INCLUDE pathnew(Part3_Income_process);
	proc datasets LIB = work KILL; run;
%INCLUDE pathnew(Part4_Wealth_returns);
	proc datasets LIB = work KILL; run;

* Merge all necessary datasets and save;
%macro m0a(f_or_i);
%do I = 1999 %to 2007;

data temp.linda3&f_or_i&I;
	merge 
				temp.linda&f_or_i&I(in = in_linda) 
		/*1*/	grund.bostadsort&f_or_i 
		/*1*/	grund.education&f_or_i&I(rename = (nybidnr = id))
		/*1*/	grund.pensionsavingFAM&f_or_i&I 
		/*1*/	grund.birthcountry&f_or_i (keep = id birthcountry_gr)
		/*1*/	grund.snikod&f_or_i&I
		/*1*/	grund.incomes&f_or_i&I
		/*1*/	grund.early_inc&f_or_i
		
		/*1*/	grund.entreprenourFAM&f_or_i&I 
		/*1*/	grund.entreprenourFAMhist&f_or_i 
		/*1*/	grund.unemployment&f_or_i 
		/*1*/	grund.industry&f_or_i&I(keep = id sni_2 sni_entr_2 sni_1 sni_entr_1 sni sni_entr)
		/*1*/	grund.industry_2_1993&f_or_i
		/*1*/	grund.industry_2_1995&f_or_i
		/*1*/	grund.entr_snihh&f_or_i&I

		/*1*/	grund.welfare&f_or_i&I
		/*1*/	grund.interest&f_or_i&I
		/*1*/	temp.arrivalyears&f_or_i
		/*1*/	grund.edu_arrival&f_or_i
		/*1*/	grund.occupations&f_or_i

		/*3*/	temp.edu_orient1999
		/*4*/	save.returns&f_or_i&I
			;
	by id; 
		if in_linda = 1;
				parish = region;						* Rename region variable as parish;
				muni = substr(parish,1,4); 				* Create municipality variable;
				partnerdummy = (familyrelation = '1');	* Role in the family (1=Partner,2=Single parent,3=Child,4=Single,0=Others);
				over18 = (age >= 18);					* Adult indicator variable;
				singleparentwithdepchild = (familyrelation = '2' and hh_children > 0); *Single parent with child;
				hhdebt = hhdebt - hhdebtstudies; 		* Exclude government student debt from debt variable;
   				hhhouse = hhhouse + hhcoop; 			* Include the value of coop apartments in home value;
				hh_adults = hh_size - hh_children;		* Number of adults in the household;
	keep &allvars;
run;

proc datasets LIB = work KILL;

%end;
%mend;
%m0a(f);
%m0a(i);
*************************************************************************************************************************************;

************************************************************;
** STEP 2: MAIN CODE FOR CONSTRUCTION OF ANALYSIS DATASET **;
************************************************************;

* First append the data for all years and throw away uninteresting variables.
  Create some new variables.
  Construct discard history.;

%macro m4ca;

%let y = 1999;

data bal.lindaf;
	set temp.linda3f&y(keep = &allvars);
run;

data bal.lindai;
	set temp.linda3i&y(keep = &allvars);
run;

%do y = 2000 %to 2007;	
	
	data temp.lindafapp;
		set temp.linda3f&y(keep = &allvars);
	run;

	data temp.lindaiapp;
		set temp.linda3i&y(keep = &allvars);
	run;

	proc append base = bal.lindaf data = temp.lindafapp;
	run;
	
	proc append base = bal.lindai data = temp.lindaiapp;
	run;	

%end;

data _NULL_;
	set bal.lindaf(keep = id) end = last;
	if last then call symput('fsize',_N_);
run;

data _NULL_;
	set bal.lindai(keep = id) end = last;
	if last then call symput('isize',_N_);
run;

data bal.discardhistory;
	length discardinfo $ 100 total_size swe_size immigr_size total_change swe_change immigr_change total_prop swe_prop immigr_prop 8;
	label discardinfo = 'Sample change';
	label total_size = 'Total sample size';
	label swe_size = 'Swedish sample size';
	label immigr_size = 'Immigrant sample size';
	label total_change = 'Percentual decrease in total sample size';
	label swe_change = 'Percentual decrease in Swedish sample size';
	label immigr_change = 'Percentual decrease in Immigrant sample size';
	label total_prop = 'Proportion of total original sample still remaining';
	label swe_prop = 'Proportion of Swedish original sample still remaining';
	label immigr_prop = 'Proportion of Immigrant original sample still remaining';
	discardinfo = 'Original sample';
	swe_size = &fsize;
	immigr_size = &isize;
	total_size = swe_size + immigr_size;
	total_change = 0;
	swe_change = 0;
	immigr_change = 0;
	total_prop = 1;
	swe_prop = 1;
	immigr_prop = 1;	
run;

%mend;
%m4ca;

%macro appendDiscardHistory(discardinfo,swe_size,immigr_size);
data _NULL_;
	set bal.discardhistory;
	if _N_ = 1 then do;
		call symput('totorsize',total_size);
		call symput('sweorsize',swe_size);
		call symput('immigrorsize',immigr_size);
	end;
run;
data bal.discardhistory;
	set bal.discardhistory end = last;
	output;
	if last then do;
		discardinfo = &discardinfo;
		total_change = 1 - (&swe_size + &immigr_size)/total_size;
		swe_change = 1 - &swe_size/swe_size;
		immigr_change = 1 - &immigr_size/immigr_size;
		total_size = &swe_size + &immigr_size;
		swe_size = &swe_size;
		immigr_size = &immigr_size;		
		total_prop = total_size/ &totorsize;
		swe_prop = swe_size/ &sweorsize;
		immigr_prop = immigr_size/ &immigrorsize;
		output;
	end;
run;	
%mend;

* Sort and decide hh-head for every year;
%let isize = 0;
%let fsize = 0;

%macro m4cb(f_or_i);

proc sort data = bal.linda&f_or_i;
	by year idhh partnerdummy singleparentwithdepchild over18 inc age id;
run;

data temp.heads&f_or_i;
	set bal.linda&f_or_i(keep = year idhh id);
	by year idhh;
	if last.idhh then head = 1;
	else head = 0;
run;
	
proc sort data = temp.heads&f_or_i;
	by id idhh;
run;

proc means data = temp.heads&f_or_i noprint;
	by id idhh;
	output out = temp.sumheads&f_or_i sum(head) = nheadyear;
run;

data temp.remhh&f_or_i;
	set temp.sumheads&f_or_i;
	if nheadyear > 0 and _FREQ_ < 9;
run;

proc sort data = temp.sumheads&f_or_i;
	by idhh;
run;

proc sort data = temp.remhh&f_or_i;
	by idhh;
run;

data temp.sumheads&f_or_i;
	merge temp.sumheads&f_or_i(in = left) temp.remhh&f_or_i(in = right keep = idhh);
	by idhh;
	if left and ^right and nheadyear >= 5;
run;

proc sort data = bal.linda&f_or_i;
	by id year;
run;

proc sort data = temp.sumheads&f_or_i;
	by id;
run;

data bal.linda&f_or_i;
	merge bal.linda&f_or_i temp.sumheads&f_or_i(in = right keep = id);
	by id;
	if right;
run;

data _NULL_;
	set bal.linda&f_or_i(keep = id) end = last;
	if last then call symput("%scan(&f_or_i,1)size",_N_);
run;

%mend;
%m4cb(i);
%m4cb(f);
%appendDiscardHistory('Keep only hhs where the head one year exists all years. Only one observation per hh.',&fsize,&isize);


* Replace partially missing edulev and remove them with missing edulev all years;
%macro m4cc(f_or_i);

data temp.missedulev&f_or_i;
	set bal.linda&f_or_i;
	keep id year edulev;
run;

proc sort data = temp.missedulev&f_or_i;
	by id descending year;
run;

* Replace backward;
data temp.missedulev&f_or_i;
	set temp.missedulev&f_or_i;
	by id;
	length edulevbeforemiss $ 2;
	retain edulevbeforemiss;

	if first.id then do;
		edulevbeforemiss = '9';
	end;	

	if edulev = '9' then do;
		if edulevbeforemiss ^= '9' then edulev = edulevbeforemiss;
	end;
	else edulevbeforemiss = edulev;
	drop edulevbeforemiss;
run;

proc sort data = temp.missedulev&f_or_i;
	by id year;
run;

* Replace forward;
data temp.missedulev&f_or_i;
	set temp.missedulev&f_or_i;
	by id;
	length edulevbeforemiss $ 2;
	retain edulevbeforemiss;

	if first.id then do;
		edulevbeforemiss = '9';
	end;	

	if edulev = '9' then do;
		if edulevbeforemiss ^= '9' then edulev = edulevbeforemiss;
	end;
	else edulevbeforemiss = edulev;
	drop edulevbeforemiss;
run;

data temp.missedulev&f_or_i;
	set temp.missedulev&f_or_i;
	where edulev ^= '9';
run;

data bal.linda&f_or_i;
	merge temp.missedulev&f_or_i(in = left) bal.linda&f_or_i(drop = edulev);
	by id year;
	if left;
run;

data _NULL_;
	set bal.linda&f_or_i(keep = id) end = last;
	if last then call symput("%scan(&f_or_i,1)size",_N_);
run;

%mend;
%m4cc(i);
%m4cc(f);
%appendDiscardHistory('Keep only non missing education level',&fsize,&isize);


* Remove those with age <= 18 any year;
%macro m4cd(f_or_i);

data temp.tooyoung&f_or_i;
	set bal.linda&f_or_i(keep = id year age);
	where age < 18;
run;

proc sort data = temp.tooyoung&f_or_i nodupkeys;
	by id;
run;

data bal.linda&f_or_i;
	merge bal.linda&f_or_i(in = left) temp.tooyoung&f_or_i(in = right keep = id);
	by id;
	if left and ^right;
run;

data _NULL_;
	set bal.linda&f_or_i(keep = id) end = last;
	if last then call symput("%scan(&f_or_i,1)size",_N_);
run;

%mend;
%m4cd(i);
%m4cd(f);
%appendDiscardHistory('Keep only persons with age >= 18 all years',&fsize,&isize);


* Remove those with household income < SEK 10,000 any year;
%macro m4ce(f_or_i);

data temp.toopoor&f_or_i;
	set bal.linda&f_or_i(keep = id year inchh);
	where inchh < 10000;
run;

proc sort data = temp.toopoor&f_or_i nodupkeys;
	by id;
run;

data bal.linda&f_or_i;
	merge bal.linda&f_or_i(in = left) temp.toopoor&f_or_i(in = right keep = id);
	by id;
	if left and ^right;
run;

data _NULL_;
	set bal.linda&f_or_i(keep = id) end = last;
	if last then call symput("%scan(&f_or_i,1)size",_N_);
run;

%mend;
%m4ce(i);
%m4ce(f);
%appendDiscardHistory('Keep only persons with hh income >= 10000 all years',&fsize,&isize);


* Check for persons with missing/multiple birthcountries and remove the strange ones.
  First change the codes for persons from former Yugoslavia/Czech Republic to be the same;
%macro m4cf(f_or_i);

data bal.linda&f_or_i;
	set bal.linda&f_or_i;
	if birthcountry_code in &yugo then birthcountry_code = 'YU';
	if birthcountry_code in &czech then birthcountry_code = 'CZ';
	if sp_birthcountry_code in &yugo then sp_birthcountry_code = 'YU';
	if sp_birthcountry_code in &czech then sp_birthcountry_code = 'CZ';
run;

data temp.missbc&f_or_i;
	set bal.linda&f_or_i(keep = id year birthcountry_code);
run;

data temp.missbc&f_or_i;
	set temp.missbc&f_or_i;
	by id;
	length lastbc $ 2 hasmiss 8;
	retain lastbc hasmiss;

	if first.id then do;
		lastbc = birthcountry_code;
		hasmiss = 0;
	end;	

	if birthcountry_code ^= lastbc then hasmiss = 1;
	if last.id and hasmiss then output;
	keep id;
run;

data temp.missbc&f_or_i;
	merge temp.missbc&f_or_i(in = left) bal.linda&f_or_i(keep = id year birthcountry_code);
	by id;
	if left;
run;

proc sort data = temp.missedulev&f_or_i;
	by id descending year;
run;

* Replace backward;
data temp.missbc&f_or_i;
	set temp.missbc&f_or_i;
	by id;
	length lastbc $ 2;
	retain lastbc;

	if first.id then do;
		lastbc = birthcountry_code;
	end;	

	if missing(birthcountry_code) or birthcountry_code in &nocountry then do;
		if ^(missing(lastbc) or lastbc in &nocountry) then birthcountry_code = lastbc;
	end;
	else if birthcountry_code = 'SU' then do;
		if lastbc ^= 'SU' then birthcountry_code = lastbc;
	end;
	else lastbc = birthcountry_code;
	drop lastbc;
run;

proc sort data = temp.missbc&f_or_i;
	by id year;
run;

* Replace forward;
data temp.missbc&f_or_i;
	set temp.missbc&f_or_i;
	by id;
	length lastbc $ 2;
	retain lastbc;

	if first.id then do;
		lastbc = birthcountry_code;
	end;	

	if missing(birthcountry_code) or birthcountry_code in &nocountry then do;
		if ^(missing(lastbc) or lastbc in &nocountry) then birthcountry_code = lastbc;
	end;
	else if birthcountry_code = 'SU' then do;
		if lastbc ^= 'SU' then birthcountry_code = lastbc;
	end;
	else lastbc = birthcountry_code;
	drop lastbc;
run;

data temp.missbc2&f_or_i;
	set temp.missbc&f_or_i;
	by id;
	length lastbc $ 2 hasmiss 8;
	retain lastbc hasmiss;

	if first.id then do;
		lastbc = birthcountry_code;
		hasmiss = 0;
	end;	

	if birthcountry_code ^= lastbc then hasmiss = 1;
	if last.id and hasmiss then output;
	keep id;
run;

proc sort data = temp.missbc&f_or_i nodupkeys;
	by id;
run;

proc sort data = temp.missbc2&f_or_i nodupkeys;
	by id;
run;

data bal.linda&f_or_i;
	merge bal.linda&f_or_i(in = left) temp.missbc&f_or_i(in = middle rename = (birthcountry_code = newbcc)) temp.missbc2&f_or_i(in = right);
	by id;
	if left and ^right;
	if middle then birthcountry_code = newbcc;
	drop newbcc;
run;

data _NULL_;
	set bal.linda&f_or_i(keep = id) end = last;
	if last then call symput("%scan(&f_or_i,1)size",_N_);
run;

%mend;
%m4cf(i);
%m4cf(f);
%appendDiscardHistory('Remove those with multiple birthcountries',&fsize,&isize);


* Remove those with missing hhnetw (household net wealth);
%macro m4ci(f_or_i);

data temp.miss_hhnetw&f_or_i;
	set bal.linda&f_or_i(keep = id year hhnetw);
	where missing(hhnetw);
run;

proc sort data = temp.miss_hhnetw&f_or_i nodupkeys;
	by id;
run;

data bal.linda&f_or_i;
	merge bal.linda&f_or_i(in = left) temp.miss_hhnetw&f_or_i(in = right keep = id);
	by id;
	if left and ^right;
run;

data _NULL_;
	set bal.linda&f_or_i(keep = id) end = last;
	if last then call symput("%scan(&f_or_i,1)size",_N_);
run;

%mend;
%m4ci(i);
%m4ci(f);
%appendDiscardHistory('Keep only persons with non-missing hh net wealth',&fsize,&isize);


* Join regular and immigrant linda and remove households that exist in both;
%macro m4cj;

data temp.inf;
	set bal.lindaf(keep = id);
run;

proc sort data = temp.inf nodupkeys;
	by id;
run;

data bal.linda;
	merge temp.inf(in = left) bal.lindai(in = right);
	if right and ^left;
	by id;
run;

proc append base = bal.linda data = bal.lindaf;
run;

%mend;
%m4cj;


* Check that there are no duplicates;
proc sort nodupkey data = bal.linda dupout = temp.dups;
	by id year;
run;

proc sort nodupkey data = bal.linda dupout = temp.dups2;
	by idhh year;
run;


* Separate into one dataset for each year and output;
%macro m4cl;

data out.linda1999&date out.linda2000&date out.linda2001&date out.linda2002&date out.linda2003&date out.linda2004&date out.linda2005&date out.linda2006&date out.linda2007&date;
	set bal.linda;
		where ^missing(hhnetw);
			if year = 1999 then output out.linda1999&date;
			%do y = 2000 %to 2007;
				else if year = &y then output out.linda&y&date;
			%end;
run;

* Output to excel (.csv);
%do y = 1999 %to 2007;

proc export data = out.linda&y&date
		 outfile = "DEFINE PATH HERE\linda&y&date..csv" dbms=csv replace;
run;

%end;

proc export data = bal.discardhistory
		 outfile = "DEFINE PATH HERE\discardhistory.csv" dbms=csv replace;
run;

%mend;
%m4cl;


*************************************************************************************************************************************
********************************************************* END OF CODE ***************************************************************
*************************************************************************************************************************************;




